Setup

# Also do Session > Set Working Directory > Choose Directory
knitr::opts_knit$set(root.dir = ".")
library(BiocParallel)  # SnowParam()
library(dplyr)  # left_join()
library(edgeR)  # DGEList()
library(limma)  # plotMDS()
library(ggrepel) # geom_text_repel()
library(ggplot2)  # ggplot()
library(gplots)  # heatmap.2()
library(grDevices)  # colorRampPalette()
#library(philentropy)  # JSD()
library(rtracklayer)  # import()
library(stringr)  # str_match()
library(variancePartition)  # fitExtractVarPartModel()
library(reshape)  # melt()
library(Glimma)
library(plyr)
library(corrplot)
library(ggpubr)

User defined variables

sex_karyotype <- "XY"
control <- "CONTROL"
condition <- "LBD"
control_color <- "gray29"
condition_color <- "red"
myContrasts <- c("LBD - CONTROL")
tool = c("star")
pathToRef = c("/research/labs/neurology/fryer/projects/references/human/")
pathToRawData = c("/research/labs/neurology/fryer/projects/LBD_CWOW/")

Read data

# read in metadata
metadata <- read.delim(paste0(pathToRawData, "metadata.tsv"), 
                       header = TRUE,
                       sep = "\t")

RIN <- read.delim(paste0(pathToRawData, "RIN.tsv"), 
                       header = TRUE,
                       sep = "\t")
RIN$NPID <- RIN$Sample
metadata <- merge(metadata, RIN, by = "NPID")
# read in counts data
counts <- read.delim(
  paste0("../../featureCounts/LBD_", sex_karyotype,".counts"),
  header = TRUE,
  sep = "\t", check.names=FALSE)
# get the sampleID from the counts file to match with metadata
sampleIDs <- colnames(counts)
samples <- as.data.frame(sampleIDs)
# split the sampleID by _ 
sample_count_id <- as.data.frame(str_split_fixed(samples$sampleID, "_", 2))
sample_count_id$counts_id <- samples$sampleIDs
# rename columns
names(sample_count_id)[names(sample_count_id) == 'V1'] <- 'NPID'
names(sample_count_id)[names(sample_count_id) == 'V2'] <- 'flow_lane'
# merge sample_count_id and metadata files
# this way metadata contains the sampleID to match the counts table 
counts_metadata <- merge(metadata, sample_count_id, by = "NPID")

# gene information
counts.geneid <- read.delim(
  "../../featureCounts/gene_info.txt",
  header = TRUE,
  sep = "\t"
)
counts.geneid <- as.data.frame(counts.geneid$Geneid)
colnames(counts.geneid) <- "gene_id"
# add gene_name as row names to counts file
rownames(counts) <- counts.geneid$gene_id

# read in annotation file
gtf.file <- paste0(pathToRef, "gencode.v38.annotation.gtf")
genes.gtf <- rtracklayer::import(gtf.file)
genes.gtf <- as.data.frame(genes.gtf)
genes.gtf <- genes.gtf[genes.gtf$type == "gene",]
table(genes.gtf$gene_type)
## 
##                          IG_C_gene                    IG_C_pseudogene 
##                                 14                                  9 
##                          IG_D_gene                          IG_J_gene 
##                                 37                                 18 
##                    IG_J_pseudogene                      IG_pseudogene 
##                                  3                                  1 
##                          IG_V_gene                    IG_V_pseudogene 
##                                145                                187 
##                             lncRNA                              miRNA 
##                              16888                               1879 
##                           misc_RNA                            Mt_rRNA 
##                               2212                                  2 
##                            Mt_tRNA             polymorphic_pseudogene 
##                                 22                                 49 
##               processed_pseudogene                     protein_coding 
##                              10163                              19955 
##                         pseudogene                           ribozyme 
##                                 15                                  8 
##                               rRNA                    rRNA_pseudogene 
##                                 47                                497 
##                             scaRNA                              scRNA 
##                                 49                                  1 
##                             snoRNA                              snRNA 
##                                943                               1901 
##                               sRNA                                TEC 
##                                  5                               1056 
##                          TR_C_gene                          TR_D_gene 
##                                  6                                  4 
##                          TR_J_gene                    TR_J_pseudogene 
##                                 79                                  4 
##                          TR_V_gene                    TR_V_pseudogene 
##                                106                                 33 
##   transcribed_processed_pseudogene     transcribed_unitary_pseudogene 
##                                502                                143 
## transcribed_unprocessed_pseudogene    translated_processed_pseudogene 
##                                950                                  2 
##  translated_unprocessed_pseudogene                 unitary_pseudogene 
##                                  1                                 98 
##             unprocessed_pseudogene                          vault_RNA 
##                               2614                                  1
#counts.geneid
#names(counts.geneid)[1] <- 'gene_name'
joined.df <- join(counts.geneid, genes.gtf, type = "inner")
## Joining by: gene_id
# check columns and rows match up between files
all.equal(rownames(counts), counts.geneid$gene_id)
## [1] TRUE
all.equal(rownames(counts), genes.gtf$gene_id)
## [1] TRUE
# reorder counts to be in the same order as metadata table
counts <- counts[, counts_metadata$counts_id]
all.equal(colnames(counts), (counts_metadata$counts_id))
## [1] TRUE

Create DGE object

# create object
dge <- DGEList(counts = counts,
               samples = counts_metadata,
               genes = genes.gtf)

table(dge$samples$TYPE)
## 
##      CONTROL CONTROL - AD CONTROL - PA          LBD 
##           16            6            6           78
# remove AD and PA controls
remove <- vector()
for (i in 1:nrow(dge$samples)) {
  if (dge$samples$TYPE[i] == "CONTROL - AD" | dge$samples$TYPE[i] == "CONTROL - PA") {
    remove <- c(remove, i)
  }
}
dge <- dge[,-remove, keep.lib.sizes = FALSE]
table(dge$samples$TYPE)
## 
## CONTROL     LBD 
##      16      78

column 1 is pid id column 15 is group information (LPS or control) column 18 is date of birth

saveRDS(dge, file = paste0("../../rObjects/", condition, "_", tool,"_", sex_karyotype, "_gene_raw.rds"))

Remove mitochondrial genes

dim(dge)
## [1] 60649    94
removeMT <- dge$genes$seqnames != "chrM"  # true when NOT MT
dge <- dge[removeMT,,keep.lib.sizes = FALSE]
dim(dge)
## [1] 60612    94

Save functions

These functions with help simultaneously save plots as a png, pdf, and tiff file.

saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}

saveToPNG <- function(...) {
    d = dev.copy(png,...)
    dev.off(d)
}

Raw MDS with technical replicates

# set colors and get data
table(dge$samples$TYPE)
## 
## CONTROL     LBD 
##      16      78
dge$samples$TYPE <- as.factor(dge$samples$TYPE)
group_colors <- c("black","red")[dge$samples$TYPE]
lcpm <- edgeR::cpm(dge$counts, log = TRUE)
par(bg = 'white')

# plot MDS
plotMDS(
  lcpm,
  top = 100, 
  labels = dge$samples$NPID,
  cex = 0.8, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors, 
  gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))

#legend(
#  "bottomleft",
#  legend = c(control, condition),
#  pch = 16,
#  col = c("black","red"),
#  cex = 0.8
#)
path <- paste0("../../results/", tool, "/MDS/", condition,"_gene_MDS_dim1&2_techreps_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2
# plot MDS
plotMDS(
  lcpm,
  top = 100, 
  labels = dge$samples$NPID,
  cex = 0.8, 
  dim.plot = c(2,3), 
  plot = TRUE, 
  col = group_colors, 
  gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))

path <- paste0("../../results/", tool, "/MDS/", condition,"_gene_MDS_dim2&3_techreps_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2

RIN check with replicates

# first filter by expression and normalize the data
keep.expr <- filterByExpr(dge, group = dge$samples$TYPE)
dim(dge)
## [1] 60612    94
dge.filtered.reps <- dge[keep.expr, , keep.lib.sizes = FALSE]

dim(dge.filtered.reps)
## [1] 17310    94
table(dge.filtered.reps$genes$gene_type)
## 
##                          IG_C_gene                             lncRNA 
##                                  1                               2413 
##                              miRNA                           misc_RNA 
##                                  5                                 11 
##             polymorphic_pseudogene               processed_pseudogene 
##                                  3                                153 
##                     protein_coding                             scaRNA 
##                              14207                                  1 
##                              scRNA                             snoRNA 
##                                  1                                  5 
##                              snRNA                                TEC 
##                                  5                                120 
##                          TR_C_gene   transcribed_processed_pseudogene 
##                                  4                                 67 
##     transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene 
##                                 39                                233 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                  1                                 41
# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.reps.norm <- calcNormFactors(dge.filtered.reps, method = "TMM")

# norm factor summary
summary(dge.filtered.reps.norm$samples$norm.factors)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7129  0.9740  1.0318  1.0051  1.0685  1.1777
log2cpm.norm.reps <- edgeR::cpm(dge.filtered.reps.norm, log = T)
nsamples <- ncol(dge.filtered.reps.norm)
boxplot(log2cpm.norm.reps, 
        main="Filtered normalized lcpm data with replicates", 
        xlab="RIN", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE)
axis(2)
axis(1,at=1:nsamples,labels=(dge.filtered.reps.norm$samples$RIN),las=2,cex.axis=0.8)

path <- paste0("../../results/", tool, "/boxplot/", condition, "_gene_boxplot_with_reps_with_RIN_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 18, height = 6)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 18, height = 6, unit = "in", res = 300)
## png 
##   2

Sum technical replicates

# sum technical replicates
dim(dge)
## [1] 60612    94
dge.tech <- sumTechReps(dge, dge$samples$NPID)
dim(dge.tech$counts)
## [1] 60612    47
colnames(dge.tech$counts) <- dge.tech$samples$NPID

Raw MDS

# set colors and get data
group_colors <- c("black", "red")[dge.tech$samples$TYPE]
lcpm <- edgeR::cpm(dge.tech$counts, log = TRUE)

par(bg = 'white')

# plot MDS
plotMDS(
  lcpm, 
  top = 100, 
  labels = dge.tech$samples$NPID,
  cex = .8, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors,
  gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))

#legend(
#  "top",
#  legend = c(control, condition),
#  pch = 16,
#  col = c(control_color, condition_color),
#  cex = 0.8
#)
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_dim1&2_raw_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2
# plot MDS
plotMDS(
  lcpm, 
  top = 100, 
  labels = dge.tech$samples$RIN,
  cex = .8, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors,
  gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))

#legend(
#  "top",
#  legend = c(control, condition),
#  pch = 16,
#  col = c(control_color, condition_color),
#  cex = 0.8
#)
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_dim1&2_raw_", sex_karyotype, "_label_RIN")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2
# plot MDS
plotMDS(
  lcpm, 
  top = 100, 
  labels = dge.tech$samples$RIN,
  cex = .8, 
  dim.plot = c(2,3), 
  plot = TRUE, 
  col = group_colors,
  gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))

path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_dim2&3_raw_", sex_karyotype, "_label_RIN")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2

Filter lowly expressed genes

The filterByExpr() function in the edgeR package determines which genes have a great enough count value to keep. We will filter by group. This means at least 6 samples (6 is the smallest group sample size) must express a minimum count of 10 (in cpm, default value).

keep.expr <- filterByExpr(dge.tech, group = dge.tech$samples$TYPE)
dim(dge.tech)
## [1] 60612    47
dge.filtered <- dge.tech[keep.expr, , keep.lib.sizes = FALSE]

dim(dge.filtered)
## [1] 19306    47
table(dge.filtered$genes$gene_type)
## 
##                          IG_C_gene                             lncRNA 
##                                  1                               3384 
##                              miRNA                           misc_RNA 
##                                 11                                 21 
##             polymorphic_pseudogene               processed_pseudogene 
##                                  6                                277 
##                     protein_coding                             scaRNA 
##                              14883                                  1 
##                              scRNA                             snoRNA 
##                                  1                                 13 
##                              snRNA                                TEC 
##                                 20                                172 
##                          TR_C_gene                          TR_J_gene 
##                                  4                                  1 
##   transcribed_processed_pseudogene     transcribed_unitary_pseudogene 
##                                 89                                 53 
## transcribed_unprocessed_pseudogene                 unitary_pseudogene 
##                                288                                  4 
##             unprocessed_pseudogene 
##                                 77

TMM normalization

# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.norm <- calcNormFactors(dge.filtered, method = "TMM")

# norm factor summary
summary(dge.filtered.norm$samples$norm.factors)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7340  0.9747  1.0350  1.0049  1.0619  1.1793

Density plot

Density plots of log-intensity distribution of each library can be superposed on a single graph for a better comparison between libraries and for identification of libraries with weird distribution.

# set graphical parameter
par(mfrow = c(1,3))

# Normalize data for library size and expression intesntiy
log2cpm.tech <- edgeR::cpm(dge.tech, log = TRUE)
log2cpm.filtered <- edgeR::cpm(dge.filtered, log = TRUE)
log2cpm.norm <- edgeR::cpm(dge.filtered.norm, log = TRUE)

# set colors
colors <- c("red","orange","green","yellow","blue","purple", 
            "lightgray","brown","pink","cyan")
nsamples <- ncol(dge.tech)

# First, plot the first column of the log2cpm.tech density
plot(density(log2cpm.tech[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25), 
     las = 2, main = "A. Raw", xlab = expression('Log'[2]~CPM))

# For each sample plot the lcpm density
for (i in 2:nsamples){
  den <- density(log2cpm.tech[,i]) #subset each column
  lines(den$x, den$y, col = colors[i], lwd = 2) 
}

# Second, plot log2cpm.filtered
plot(density(log2cpm.filtered[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25), 
     las = 2, main = "B. Filtered", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
  den <- density(log2cpm.filtered[,i])
  lines(den$x, den$y, col = colors[i], lwd = 2)
}

# Third, plot log2cpm.norm
plot(density(log2cpm.norm[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25), 
     las = 2, main = "C. Normalized", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
  den <- density(log2cpm.norm[,i])
  lines(den$x, den$y, col = colors[i], lwd = 2)
}

# save
path <- paste0("../../results/", tool, "/density/", condition, "_gene_density_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png 
##   2

Boxplots

# set parameters
par(mfrow = c(1,3))

# First look at dge.tech
boxplot(log2cpm.tech, 
        main="A. Raw", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors
        )
axis(2) # 2 = left 
axis(1, # 1 = below 
     at = 1:nsamples, # points at which tick-marks should be drawn
     labels = colnames(log2cpm.tech),
     las = 2,
     cex.axis = 0.8 # size of axis
     )

# Second, look at dge.filtered
boxplot(log2cpm.filtered, 
        main="B. Filtered", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors
        )
axis(2)
axis(1, at=1:nsamples,labels=colnames(log2cpm.filtered),las=2,cex.axis=0.8)

# Third, look at dge.norm
boxplot(log2cpm.norm, 
        main="C. Normalized", 
        xlab="", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors)
axis(2)
axis(1,at=1:nsamples,labels=colnames(log2cpm.norm),las=2,cex.axis=0.8)

# save
path <- paste0("../../results/", tool, "/boxplot/", condition, "_gene_boxplot_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 10, height = 6)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 10, height = 6, unit = "in", res = 300)
## png 
##   2

RIN check with replicates summed

boxplot(log2cpm.norm, 
        main="Filtered normalized lcpm data", 
        xlab="RIN", 
        ylab=expression('Counts per gene (Log'[2]~'CPM)'),
        axes=FALSE,
        col = colors)
axis(2)
axis(1,at=1:nsamples,labels=(dge.filtered.norm$samples$RIN),las=2,cex.axis=0.8)

path <- paste0("../../results/", tool, "/boxplot/", condition, "_gene_boxplot_sum_reps_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 12, height = 6)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 12, height = 6, unit = "in", res = 300)
## png 
##   2
# check if there is correlation between RIN and library size
box <- dge.filtered.norm$samples
plot(box$RIN, box$lib.size)

cor(box$RIN, box$lib.size, method = c("pearson", "kendall", "spearman"))
## [1] 0.6487225
cor.test(box$RIN, box$lib.size, method=c("pearson", "kendall", "spearman"))
## 
##  Pearson's product-moment correlation
## 
## data:  box$RIN and box$lib.size
## t = 5.7183, df = 45, p-value = 8.164e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4443308 0.7889202
## sample estimates:
##       cor 
## 0.6487225
# is the data normally distrubited 
ggqqplot(box$lib.size, ylab = "library size")

# wt
ggqqplot(box$RIN, ylab = "RIN")

res <- cor.test(box$lib.size, box$RIN, 
                method = "pearson")
res
## 
##  Pearson's product-moment correlation
## 
## data:  box$lib.size and box$RIN
## t = 5.7183, df = 45, p-value = 8.164e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4443308 0.7889202
## sample estimates:
##       cor 
## 0.6487225
ggscatter(box, x = "RIN", y = "lib.size", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          ylab = "library size", xlab = "RIN value") 
## `geom_smooth()` using formula 'y ~ x'

path <- paste0("../../results/", tool, "/library/", condition, "_corr_RIN_lib_size_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 6)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 6, height = 6, unit = "in", res = 300)
## png 
##   2

CCA Heatmap

#form <- ~ TYPE + Age + Sex + ClinicalDx +lib.size + LBD.type +CDLB
form <- ~ TYPE + PathDx + AD.subtype + LBD.type + CDLB + Braak.NFT + Thal.amyloid + MF.SP + MF.NFT + MF.LB + Cing.LB + MF.Amyloid + MF.Tau + Cing.Synuclein + CWOW.Category + VaD  + TDP.type + Brain.wt + ClinicalDx + FHx + Duration + Age  + PMI + APOE + MAPT + GRN + TMEM106b + RIN + Total.RNA.ng 
# removed Race since there is only white samples in this set. 

# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
info <- as.data.frame(dge.filtered.norm$samples)
C = canCorPairs( form, info)
## Warning in canCorPairs(form, info): Regression model may be problematic.
## High colinearity between variables:
##   TYPE and PathDx
##   TYPE and LBD.type
##   TYPE and CDLB
##   TYPE and CWOW.Category
##   PathDx and VaD
##   MF.Amyloid and CWOW.Category
##   MF.Tau and CWOW.Category
##   Cing.Synuclein and CWOW.Category
# Plot correlation matrix
#plotCorrMatrix( C , key.xlab = "correlation")

# replace NA with zero
C[is.na(C)] = 0
cor.mtest <- function(C, ...) {
    C <- as.matrix(C)
    n <- ncol(C)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(C[, i], C[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(C)
  p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(C)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(C, method="color", col=col(200),  
         type="upper", order="hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, #Text label color and rotation
         # Combine with significance
        # p.mat = p.mat, sig.level = 0.1, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=FALSE 
         )

path <- paste0("../../results/", tool ,"/varpart/LBD_gene_CCA", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 25, height = 25)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 23, height = 25, unit = "in", res = 300)
## png 
##   2
# CCA for only a handful of variables 
form <- ~ TYPE +  Brain.wt + Duration + Age + PMI + RIN
# removed Race since there is only white samples in this set. 

# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
info <- as.data.frame(dge.filtered.norm$samples)
C = canCorPairs( form, info)
# Plot correlation matrix
#plotCorrMatrix( C , key.xlab = "correlation")

# replace NA with zero
C[is.na(C)] = 0
cor.mtest <- function(C, ...) {
    C <- as.matrix(C)
    n <- ncol(C)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(C[, i], C[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(C)
  p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(C)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(C, method="color", col=col(200),  
         type="upper", order="hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, #Text label color and rotation
         # Combine with significance
        # p.mat = p.mat, sig.level = 0.1, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=FALSE 
         )

Save R object

saveRDS(dge.filtered.norm, file = paste0("../../rObjects/", condition, "_dge.filtered.norm", sex_karyotype, ".rds"))
#dge.filtered.norm <- readRDS( file = paste0("../../rObjects/", condition, "_dge.filtered.norm", sex_karyotype, ".rds"))

Variance partition

register(SnowParam(detectCores())) # work in parallel (takes a while to run)

# geneExpr: matrix of gene expression values
# info: information/metadata about each sample
geneExpr <- as.matrix(dge.filtered.norm$counts)
info <- as.data.frame(dge.filtered.norm$samples)
info$RIN <- as.integer(info$RIN)
info$TYPE <- as.character(info$TYPE)

# age is usually a continuous so model it as a fixed effect "age"
# group is categorical, so model them as random effects "(1|group)"
# note the syntax
form <- ~Age + PMI + RIN + (1|TYPE)# + (1 | CWOW.Category)
#
#(1 | LBD.type) + 
#(1 | TDP.type)  + 
#(1 | ClinicalDx) + 
#(1 | FHx) + 
#(1 | Sex) + 
#(1 | Race) + 
#(1 | TMEM106b) +
#Braak.NFT + 
#Thal.amyloid + 
#MF.SP  + 
#MF.LB + 
#Cing.LB + 
#MF.Amyloid + 
#MF.Tau + 
#Cing.Synuclein + 
#VaD + 
#TDP.43 + 
#Brain.wt + 
#Duration + 
#Age + 
#PMI + 
#RIN

varPart <- fitExtractVarPartModel(geneExpr, form, info)
## Warning in .fitExtractVarPartModel(exprObj, formula, data, REML = REML, : Variables contain NA's: PMI 
## Samples with missing data will be dropped.
## Warning in optwrap(optimizer, devfun, getStart(start, rho$pp), lower =
## rho$lower, : convergence code -4 from nloptwrap: NLOPT_ROUNDOFF_LIMITED:
## Roundoff errors led to a breakdown of the optimization algorithm. In this case,
## the returned minimum may still be useful. (e.g. this error occurs in NEWUOA if
## one tries to achieve a tolerance too close to machine precision.)
vp <- sortCols(varPart)
# plot
plotVarPart(vp)

# save
path <- paste0("../../results/", tool, "/varpart/", condition, "_gene_varpart_violins_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png 
##   2

Add gene name to gene_id

# plot
plotPercentBars(vp[1:10,])

# make a column called gene_id to later merge with genes.gtf
vp$gene_id <- row.names(vp)
# dataframe of gene_id and gene_name 
gene_id_name <- cbind(genes.gtf$gene_id, genes.gtf$gene_name)
df <- as.data.frame(gene_id_name)
names(df)[names(df) == 'V1'] <- 'gene_id'
names(df)[names(df) == 'V2'] <- 'gene_name'

vp_df <- merge(vp, df)
row.names(vp_df) <- make.names(vp_df$gene_name, unique = TRUE)
vp_df$gene_id <- NULL
vp_df$gene_name <- NULL

plotPercentBars(vp_df[1:10,])

# save
path <- paste0("../../results/", tool, "/varpart/", condition, "_gene_percent_bars", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 6)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 6, height = 6, unit = "in", res = 300)
## png 
##   2

Top variable genes

varPart.df <- as.data.frame(vp_df)
# sort genes based on variance explained by group
order.varPart.df <- varPart.df[order(varPart.df$RIN, decreasing = TRUE),]
head(order.varPart.df["RIN"], 10)

# sort genes based on variance explained by Race
order.varPart.df <- varPart.df[order(varPart.df$Race, decreasing = TRUE),]
head(order.varPart.df["Race"], 10)

Design matrix

age <- as.numeric(dge.filtered.norm$samples$Age)
RIN <- as.numeric(dge.filtered.norm$samples$RIN)
race <- as.factor(dge.filtered.norm$samples$Race)

group <- interaction(dge.filtered.norm$samples$TYPE)

design <- model.matrix(~ 0 + group)
colnames(design) <- c(control,condition)

design
##    CONTROL LBD
## 1        1   0
## 2        1   0
## 3        0   1
## 4        0   1
## 5        0   1
## 6        0   1
## 7        0   1
## 8        0   1
## 9        0   1
## 10       0   1
## 11       0   1
## 12       0   1
## 13       0   1
## 14       1   0
## 15       1   0
## 16       0   1
## 17       0   1
## 18       0   1
## 19       1   0
## 20       0   1
## 21       0   1
## 22       1   0
## 23       1   0
## 24       0   1
## 25       0   1
## 26       0   1
## 27       0   1
## 28       0   1
## 29       0   1
## 30       0   1
## 31       0   1
## 32       0   1
## 33       0   1
## 34       0   1
## 35       0   1
## 36       0   1
## 37       0   1
## 38       0   1
## 39       0   1
## 40       0   1
## 41       0   1
## 42       0   1
## 43       0   1
## 44       0   1
## 45       1   0
## 46       0   1
## 47       0   1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"

Voom

# voom transform counts
v <- voomWithQualityWeights(dge.filtered.norm,
                            design,
                            plot = TRUE)

# save
path <- paste0("../../results/", tool, "/voom/", condition, "_gene_mean_var_weights_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png 
##   2
# fits linear model for each gene given a series of arrays
fit <- lmFit(v, design)

# contrast design for differential expression
contrasts <- makeContrasts(
  title = myContrasts,  # myContrasts was user input from beginning
  levels = colnames(design))
head(contrasts)
##          Contrasts
## Levels    LBD - CONTROL
##   CONTROL            -1
##   LBD                 1
# save contrast names
allComparisons <- colnames(contrasts)
allComparisons # check
## [1] "LBD - CONTROL"
# run contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)

# Compute differential expression based on the empirical Bayes moderation of the
# standard errors towards a common value.
veBayesFit <- eBayes(vfit)
plotSA(veBayesFit, main = "Final Model: Mean-variance Trend")

# save
path <- paste0("../../results/", tool, "/voom/", condition, "_gene_final_mean_var_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png 
##   2

Voom MDS Plot

group_colors <- c("black", "red")[v$targets$TYPE]
names <- v$targets$NPID

plotMDS(
  v, 
  top = 100, 
  labels = names,
  cex = 1, 
  dim.plot = c(1,2), 
  plot = TRUE, 
  col = group_colors
)

title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))

# save
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_normalized_dim1&2_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 5, height = 5)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2
plotMDS(
  v, 
  top = 100, 
  labels = names,
  cex = 1, 
  dim.plot = c(2,3), 
  plot = TRUE, 
  col = group_colors
)

title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))

# save
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_normalized_dim2&3_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 5, height = 5)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png 
##   2
saveRDS(v, file = paste0("../../rObjects/", condition, "_gene_voom", sex_karyotype, ".rds"))

Number of DEGs

Identify number of differential expressed genes.

pval <- 0.05

sumTable <- 
  summary(decideTests(
    vfit,  # object
    adjust.method = "BH", # by default the method = "separate"
    p.value = pval,
    lfc = 0  # numeric, minimum absolute log2-fold change required
  ))

print(paste0(" FDRq < ", pval))
## [1] " FDRq < 0.05"
sumTable
##        LBD - CONTROL
## Down             160
## NotSig         19067
## Up                79

Output DEG tables

coef <- 1

for (i in allComparisons) {
  # p < 1, log2fc > 0 
  vTopTableAll <-
    topTable(
      veBayesFit, 
      coef = coef,  
      n = Inf, 
      p.value = 1,
      lfc = 0 
    )
  path <- paste("../../results/", tool, "/DEGs/", condition, "_gene_DEGs_FDRq1.00_", sex_karyotype, ".txt", sep = "") 
  write.table(
    vTopTableAll,
    path,
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )
  
  # p < 0.05, log2fc > 0
  vTopTable1 <-
    topTable( 
      veBayesFit,  
      coef = coef,  
      n = Inf, 
      p.value = 0.05,
      lfc = 0
    )
  path <- paste("../../results/", tool, "/DEGs/", condition, "_gene_DEGs_FDRq0.05_", sex_karyotype, ".txt", sep = "") 
  write.table(
    vTopTable1,
    path,
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
  )
  # increment -----------------------------------------------------------------
  coef <- coef + 1
}

Read and save table with all genes (FDRq = 1).

condition_vs_control <- read.table(
  paste0("../../results/", tool, "/DEGs/", condition, "_gene_DEGs_FDRq1.00_", sex_karyotype, ".txt", sep = ""),
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE)

saveRDS(condition_vs_control, file = paste0("../../rObjects/", condition, "_gene_table_", sex_karyotype, ".rds"))

Assign colors

Assign colors values based on FDRq cutoff of 0.2.

color_values <- vector()
max <- nrow(condition_vs_control)

for(i in 1:max){
  if (condition_vs_control$adj.P.Val[i] < 0.2){
    if (condition_vs_control$logFC[i] > 0){
      color_values <- c(color_values, 1) # 1 when logFC > 0 and FDRq < 0.05
    }
    else if (condition_vs_control$logFC[i] < 0){
      color_values <- c(color_values, 2) # 2 when logFC < 0 and FDRq < 0.05
    }
  }
  else{
    color_values <- c(color_values, 3) # 3 when FDRq >= 0.05
  }
}

condition_vs_control$color_p0.05 <- factor(color_values)

Subset genes to label

Subset the top 10 up and down-regulated genes

up <- condition_vs_control[condition_vs_control$color_p0.05 == 1,]
up10 <- up[1:10,]

down <- condition_vs_control[condition_vs_control$color_p0.05 == 2,]
down <- subset(down, down$logFC < -1.5)
down10 <- down[1:7,]

Volcano plot

hadjpval <- (-log10(max(
  condition_vs_control$P.Value[condition_vs_control$adj.P.Val < 0.2], 
  na.rm=TRUE)))

p_vol <-
  ggplot(data = condition_vs_control, 
         aes(x = logFC,  # x-axis is logFC
             y = -log10(P.Value),  # y-axis will be -log10 of P.Value
             color = color_p0.05)) +  # color is based on factored color column
  geom_point(alpha = 0.8, size = 1.7) +  # create scatterplot, alpha makes points transparent
  theme_bw() +  # set color theme
  theme(legend.position = "none") +  # no legend
  scale_color_manual(values = c("red", "blue","grey")) +  # set factor colors
  labs(
    title = "", # no main title
    x = expression(log[2](FC)), # x-axis title
    y = expression(-log[10] ~ "(" ~ italic("p") ~ "-value)") # y-axis title
  ) +
  theme(axis.title.x = element_text(size = 10),
        axis.text.x = element_text(size = 10)) +
  theme(axis.title.y = element_text(size = 10),
        axis.text.y = element_text(size = 10)) +
  geom_hline(yintercept = hadjpval,  #  horizontal line
                     colour = "#000000",
                     linetype = "dashed") +
  ggtitle("LBD vs Control") +
  theme(plot.title = element_text(size = 10)) +
  geom_text_repel(data = up10,
                  aes(x = logFC, y= -log10(P.Value), label = gene_name), 
                  color = "maroon", 
                  fontface="italic",
                  size = 3, 
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 30)
                  ) +
  geom_text_repel(data = down10,
                  aes(x = logFC, y= -log10(P.Value), label = gene_name), 
                  color = "navyblue", 
                  fontface="italic",
                  size = 3, 
                  max.overlaps = getOption("ggrepel.max.overlaps", default = 15)
                  ) # +
 # scale_y_continuous(breaks = seq(0,12,by=1), limits = c(0,12)) +
 # scale_x_continuous(breaks = seq(-8,8,by=2), limits = c(-8,8))
p_vol

# save
path <- paste0("../../results/", tool, "/volcano/", condition, "_gene_volcano_FDRq0.05_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 8, height = 6)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 8, height = 6, unit = "in", res = 300)
## png 
##   2

Glimma

row.names(dge.filtered.norm$genes) <- make.names(dge.filtered.norm$genes$gene_name, unique = T)
glimmaVolcano(veBayesFit, dge = dge.filtered.norm, groups = dge.filtered.norm$samples$TYPE)
help(glimmaVolcano)
dt <- decideTests(veBayesFit)
summary(veBayesFit)
##                  Length Class      Mode     
## coefficients     19306  -none-     numeric  
## stdev.unscaled   19306  -none-     numeric  
## sigma            19306  -none-     numeric  
## df.residual      19306  -none-     numeric  
## cov.coefficients     1  -none-     numeric  
## pivot                2  -none-     numeric  
## rank                 1  -none-     numeric  
## genes               26  data.frame list     
## Amean            19306  -none-     numeric  
## method               1  -none-     character
## design              94  -none-     numeric  
## contrasts            2  -none-     numeric  
## df.prior             1  -none-     numeric  
## s2.prior             1  -none-     numeric  
## var.prior            1  -none-     numeric  
## proportion           1  -none-     numeric  
## s2.post          19306  -none-     numeric  
## t                19306  -none-     numeric  
## df.total         19306  -none-     numeric  
## p.value          19306  -none-     numeric  
## lods             19306  -none-     numeric  
## F                19306  -none-     numeric  
## F.p.value        19306  -none-     numeric
#glMDPlot(veBayesFit, coef=1, status=dt, main=colnames(efit)[1],
#         side.main="ENTREZID", counts=v$E, groups=TYPE, launch=T)

TYPE <- as.factor(dge.filtered.norm$samples$TYPE)
library(gplots)
basal.vs.lp <- topTreat(veBayesFit, coef=1, n=Inf)
basal.vs.lp.topgenes <- basal.vs.lp$gene_name[1:100]
i <- which(v$genes$gene_name %in% basal.vs.lp.topgenes)
mycol <- colorpanel(1000,"blue","white","red")
heatmap.2(v$E[i,], scale="row",
   labRow=v$genes$gene_name[i], labCol=TYPE, 
   col=mycol, trace="none", density.info="none", 
   margin=c(8,6), lhei=c(2,10), dendrogram="column")
## Error in plot.new(): figure margins too large

path <- paste0("../../results/", tool, "/volcano/LBD_gene_volcano_FDRq0.05_heatmap", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 12, height = 8)
## png 
##   2
saveToPNG(paste0(path, ".png"), width = 8, height = 6, unit = "in", res = 300)
## png 
##   2

Bar plot gene of interest

GOI <- v[v$genes$gene_name %in% "TTR",]
# Barplot
out_df <- as.data.frame(GOI$E)
df_melt <- melt(out_df)
## Using  as id variables
hist(df_melt$value, breaks = 40)

df_melt$NPID <- df_melt$variable
df_melt <- merge(df_melt, metadata, by = "NPID")
ggplot(df_melt, aes(reorder(x=variable, -value), y=value, color = TYPE, fill = TYPE)) + geom_bar(stat = "identity")

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
## 
## Matrix products: default
## BLAS:   /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0              corrplot_0.92            
##  [3] plyr_1.8.6                Glimma_2.0.0             
##  [5] reshape_0.8.8             variancePartition_1.25.10
##  [7] stringr_1.4.0             rtracklayer_1.50.0       
##  [9] GenomicRanges_1.42.0      GenomeInfoDb_1.26.7      
## [11] IRanges_2.24.1            S4Vectors_0.28.1         
## [13] BiocGenerics_0.36.1       gplots_3.1.1             
## [15] ggrepel_0.9.1             ggplot2_3.3.5            
## [17] edgeR_3.32.1              limma_3.46.0             
## [19] dplyr_1.0.8               BiocParallel_1.24.1      
## 
## loaded via a namespace (and not attached):
##   [1] minqa_1.2.4                 colorspace_2.0-3           
##   [3] ggsignif_0.6.3              ellipsis_0.3.2             
##   [5] XVector_0.30.0              rstudioapi_0.13            
##   [7] farver_2.1.0                bit64_4.0.5                
##   [9] AnnotationDbi_1.52.0        fansi_1.0.2                
##  [11] codetools_0.2-18            splines_4.0.3              
##  [13] doParallel_1.0.17           cachem_1.0.6               
##  [15] geneplotter_1.68.0          knitr_1.37                 
##  [17] jsonlite_1.8.0              nloptr_1.2.2.3             
##  [19] Rsamtools_2.6.0             RhpcBLASctl_0.21-247.1     
##  [21] pbkrtest_0.5.1              broom_0.7.12               
##  [23] annotate_1.68.0             aod_1.3.1                  
##  [25] compiler_4.0.3              httr_1.4.2                 
##  [27] backports_1.4.1             assertthat_0.2.1           
##  [29] Matrix_1.4-0                fastmap_1.1.0              
##  [31] cli_3.2.0                   htmltools_0.5.2            
##  [33] prettyunits_1.1.1           tools_4.0.3                
##  [35] gtable_0.3.0                glue_1.6.2                 
##  [37] GenomeInfoDbData_1.2.4      reshape2_1.4.4             
##  [39] Rcpp_1.0.8.3                carData_3.0-5              
##  [41] Biobase_2.50.0              jquerylib_0.1.4            
##  [43] vctrs_0.3.8                 Biostrings_2.58.0          
##  [45] nlme_3.1-155                iterators_1.0.14           
##  [47] xfun_0.30                   rbibutils_2.2.7            
##  [49] lme4_1.1-28                 lifecycle_1.0.1            
##  [51] gtools_3.9.2                rstatix_0.7.0              
##  [53] XML_3.99-0.8                zlibbioc_1.36.0            
##  [55] MASS_7.3-55                 scales_1.1.1               
##  [57] hms_1.1.1                   MatrixGenerics_1.2.1       
##  [59] SummarizedExperiment_1.20.0 RColorBrewer_1.1-2         
##  [61] yaml_2.3.5                  memoise_2.0.1              
##  [63] sass_0.4.0                  stringi_1.7.6              
##  [65] RSQLite_2.2.9               highr_0.9                  
##  [67] genefilter_1.72.1           foreach_1.5.2              
##  [69] caTools_1.18.2              boot_1.3-28                
##  [71] Rdpack_2.1.4                rlang_1.0.2                
##  [73] pkgconfig_2.0.3             matrixStats_0.61.0         
##  [75] bitops_1.0-7                evaluate_0.15              
##  [77] lattice_0.20-45             purrr_0.3.4                
##  [79] labeling_0.4.2              GenomicAlignments_1.26.0   
##  [81] htmlwidgets_1.5.4           bit_4.0.4                  
##  [83] tidyselect_1.1.2            magrittr_2.0.2             
##  [85] DESeq2_1.30.1               R6_2.5.1                   
##  [87] generics_0.1.2              DelayedArray_0.16.3        
##  [89] DBI_1.1.2                   mgcv_1.8-39                
##  [91] pillar_1.7.0                withr_2.5.0                
##  [93] abind_1.4-5                 survival_3.2-13            
##  [95] RCurl_1.98-1.5              tibble_3.1.6               
##  [97] car_3.0-12                  crayon_1.5.0               
##  [99] KernSmooth_2.23-20          utf8_1.2.2                 
## [101] rmarkdown_2.11              progress_1.2.2             
## [103] locfit_1.5-9.4              grid_4.0.3                 
## [105] blob_1.2.2                  digest_0.6.29              
## [107] xtable_1.8-4                tidyr_1.2.0                
## [109] munsell_0.5.0               bslib_0.3.1